Import Packages

Specify Paths

# inputs
dataset_path <- "./Data/"
function_path <- "./Functions/"

# outputs
itrt_plot_path <- "./Output/InteractivePlot/"
sttc_plot_path <- "./Output/StaticPlot/"
out_data_path <- "./Output/Data/"

# Make sure every output directory exists before anything is written to it,
# so saving plots/widgets does not depend on the folders being created by hand.
# Existing directories are left untouched (showWarnings = FALSE keeps it quiet).
invisible(lapply(
    c(itrt_plot_path, sttc_plot_path, out_data_path),
    dir.create,
    recursive = TRUE,
    showWarnings = FALSE
))

Import Dataset

# Per-day, per-country figures
daily_covid <-
    paste0(dataset_path, "worldometer_coronavirus_daily_data.csv") %>%
    import()

# One summary row per country
summary_covid <-
    paste0(dataset_path, "worldometer_coronavirus_summary_data.csv") %>%
    import()

# Preview the first rows of the daily table (console output follows)
head(daily_covid)
##         date     country cumulative_total_cases daily_new_cases active_cases
## 1 2020-02-15 Afghanistan                      0              NA            0
## 2 2020-02-16 Afghanistan                      0              NA            0
## 3 2020-02-17 Afghanistan                      0              NA            0
## 4 2020-02-18 Afghanistan                      0              NA            0
## 5 2020-02-19 Afghanistan                      0              NA            0
## 6 2020-02-20 Afghanistan                      0              NA            0
##   cumulative_total_deaths daily_new_deaths
## 1                       0               NA
## 2                       0               NA
## 3                       0               NA
## 4                       0               NA
## 5                       0               NA
## 6                       0               NA
# Preview the first rows of the per-country summary table (console output follows)
head(summary_covid)
##       country     continent total_confirmed total_deaths total_recovered
## 1 Afghanistan          Asia          158275         7367          145750
## 2     Albania        Europe          213257         3228          202077
## 3     Algeria        Africa          220415         6310          151347
## 4     Andorra        Europe           25289          141           21511
## 5      Angola        Africa           86636         1789           67477
## 6    Anguilla North America            1777            6            1702
##   active_cases serious_or_critical total_cases_per_1m_population
## 1         5158                1124                          3932
## 2         7952                  23                         74227
## 3        62758                  34                          4893
## 4         3637                  31                        326512
## 5        17370                   7                          2518
## 6           69                  NA                        116869
##   total_deaths_per_1m_population total_tests total_tests_per_1m_population
## 1                            183      826810                         20541
## 2                           1124     1495002                        520354
## 3                            140      230861                          5125
## 4                           1820      249838                       3225714
## 5                             52     1296669                         37686
## 6                            395       51382                       3379283
##   population
## 1   40250878
## 2    2873049
## 3   45046063
## 4      77452
## 5   34407243
## 6      15205

Data Cleaning

# Zero-fill missing counts and parse the date column.
# Only numeric columns are zero-filled: a blanket replace(is.na(.), 0) would
# also turn a missing `date` or `country` into 0 instead of leaving it visible.
daily_covid <-
    daily_covid %>%
    mutate(across(where(is.numeric), ~ replace(.x, is.na(.x), 0))) %>%
    mutate(date = as.Date(date))



# Sanity check: which countries still have missing cumulative deaths?
# NOTE(review): the cleaning step above zero-fills the counts, so this is
# expected to come back empty.
(x <- daily_covid %>%
    filter(is.na(cumulative_total_deaths)) %>%
    group_by(country) %>%
    summarise(n = n()) %>%
    select(country))
## # A tibble: 0 x 1
## # ... with 1 variable: country <chr>
(y <- pull(x, country))
## character(0)
# Use %in% rather than ==: `y` may hold several countries, and == would
# recycle it element-wise against the column instead of testing membership.
(subset(daily_covid, country %in% y))
## [1] date                    country                 cumulative_total_cases 
## [4] daily_new_cases         active_cases            cumulative_total_deaths
## [7] daily_new_deaths       
## <0 rows> (or 0-length row.names)

Data Manipulation and visualization

Generic Questions and Answers

Question 1

What is the overview of covid cases during that period of time?

## global percentage of death, active case and recovered ##

# Summary columns that make up the donut, plus their display labels
# (underscores replaced by spaces, then title-cased)
categories <- c("total_deaths", "total_recovered", "active_cases")
category <- str_replace_all(categories, pattern = "_", replacement = " ")
category <- str_to_title(category)

# Sum each category over all countries, ignoring missing values
data <-
    summary_covid[, categories] %>%
    colSums(na.rm = TRUE)
data <- data.frame(
    category = category,
    count = data
)

# Counts with thousands separators, for the outer labels
data$prettyCount <- prettyNum(data$count, big.mark = ",", scientific = FALSE)

# Compute percentages
data$fraction <- data$count / sum(data$count)

# Compute the cumulative percentages (top of each rectangle)
data$ymax <- cumsum(data$fraction)

# Compute the bottom of each rectangle
data$ymin <- c(0, head(data$ymax, n = -1))

# Compute label position (middle of each rectangle)
data$labelPosition <- (data$ymax + data$ymin) / 2

# Compute display percentages
data$prettyFraction <- percent(data$fraction)

# Make the plot: rectangles between x = 3 and x = 4 wrapped into polar
# coordinates form a ring; xlim(2, 4) leaves the hole in the middle.
q1 <-
    ggplot(
        data,
        aes(ymax = ymax, ymin = ymin, xmax = 4, xmin = 3, fill = category)
    ) +
    geom_rect() +
    # outer labels: absolute counts; x controls label position (inner / outer)
    geom_text(
        x = 4.3,
        aes(y = labelPosition, label = prettyCount, color = category),
        fontface = "bold", # constant, so set outside aes()
        size = 3.5
    ) +
    # labels on the ring itself: percentages
    geom_text(
        x = 3.5,
        aes(y = labelPosition, label = prettyFraction),
        fontface = "bold",
        color = "white",
        size = 4
    ) +
    scale_fill_brewer(palette = "Set2") +
    scale_color_brewer(palette = "Set2") +
    coord_polar(theta = "y") +
    xlim(c(2, 4)) +
    theme_void() +
    # grand total in the centre of the ring
    annotate(
        geom = "text",
        x = 2,
        y = 0,
        colour = "#eba834",
        label = paste0(
            "Total Cases\n",
            prettyNum(sum(data$count), big.mark = ",", scientific = FALSE)
        )
    )

# Pass the plot explicitly instead of relying on ggplot2's last_plot()
ggsave(paste0(sttc_plot_path, "q1_pie.png"), plot = q1)
## Saving 7 x 5 in image
q1

Question 2

What is the general scale of infection across different continents and countries?

## comparison of cases between different continent ##

# Treemap nested as continent > country > category, sized by case count.
q2 <-
    summary_covid %>% # data
    select(country:active_cases) %>%
    # within each continent keep only the countries above the 70th percentile
    # of confirmed cases (removes the small ones)
    group_by(continent) %>%
    filter(total_confirmed > quantile(total_confirmed, 0.7, na.rm = TRUE)) %>%
    ungroup() %>%
    # turn the 3 count columns into category/count pairs (wide to long)
    pivot_longer(
        cols = c(total_recovered, active_cases, total_deaths),
        names_to = "category",
        values_to = "count"
    ) %>%
    # explicit levels so each label is tied to its column by name,
    # not by the order in which the columns happened to be listed
    mutate(
        category = factor(
            category,
            levels = c("total_recovered", "active_cases", "total_deaths"),
            labels = c("Recovered", "Active Cases", "Deaths")
        )
    ) %>%
    treemap(
        index = c("continent", "country", "category"),
        vSize = "count",
        type = "index",
        palette = "Set2",
        title = "Group by continent top 70 percentile confirmed cases",
        align.labels = list(
            c("center", "center"),
            c("left", "top"),
            c("left", "bottom")
        )
    )

# Interactive version of the same treemap
itrt_q2 <- d3tree2(q2, rootname = "Group by continent top 70 percentile confirmed cases")

# Save with an .html extension so the widget opens in a browser
saveWidget(itrt_q2, file = paste0(itrt_plot_path, "q2.html"))
itrt_q2

Question 3

How many people suffered from covid?

# overview of accumulated cases vs date for all the country

# global stacked area plot
data <-
    daily_covid %>%

    # worldwide totals per day
    group_by(date) %>%
    summarise(
        cumulative_total_cases = sum(cumulative_total_cases, na.rm = TRUE),
        cumulative_total_deaths = sum(cumulative_total_deaths, na.rm = TRUE)
    ) %>%

    # wide to long: one row per date and category
    pivot_longer(
        cols = c(cumulative_total_cases, cumulative_total_deaths),
        names_to = "categories",
        values_to = "count"
    ) %>%

    # Tooltip text, precomputed as a column because the `text` aesthetic below
    # is given a plain column. The label is the last word of the category name
    # ("Cases" / "Deaths"); it used to be derived from a hard-coded placeholder
    # string, which labelled every tooltip "abc2". Built vectorised, so no
    # rowwise() is needed.
    mutate(
        text = paste(
            str_to_title(sub("^.*_", "", categories)),
            "Count:", prettyNum(count, big.mark = ",", scientific = FALSE),
            "\nDate:", date
        )
    ) %>%

    arrange(date) # this is just to check if text is appended properly

# write.csv(data, paste0(out_data_path, "temp.csv"))
# (subset(data, is.na(text))) # enable this to check if something is na

# Facet titles keyed by the raw category names
facet_labels <- c(
    "cumulative_total_cases" = "Cumulative Cases",
    "cumulative_total_deaths" = "Cumulative Deaths"
)

q3 <-
    ggplot(
        data,
        aes(x = date, y = count, group = categories, fill = categories, text = text)
    ) +
    geom_area() +
    # separate panels with independent y axes: deaths are far smaller than cases
    facet_wrap(~categories, scales = "free_y", labeller = as_labeller(facet_labels)) +
    scale_fill_viridis(discrete = TRUE, option = "B", begin = 0.3, end = 0.7) +
    scale_x_date(date_labels = "%b %Y") +
    scale_y_continuous(labels = unit_format(unit = "M", scale = 1e-6)) +
    ggtitle("Cumulative Covid Cases") +
    # scale_fill_discrete(name = "Categories", labels = c("Cumulative Cases", "Cumulative Death Cases")) +
    ylab("Covid Cases") +
    xlab("Date") +
    theme_ipsum() +
    # theme tweaks go after theme_ipsum(), which is a complete theme
    theme(legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 1))

q3

# Interactive version; the tooltip shows only the precomputed text
itrt_q3 <- ggplotly(q3, tooltip = "text")
itrt_q3
# hand picked countries multiple of stacked area plots

Question 4

Which country has the most cases?

# ranking of cases for the top n countries
# hist

Question 5

How have active cases changed over that period of time in each country?

# video hist

More in depth Questions

Question 1

We can easily tell from previous plots that most of the cases are from big countries. What is the relation between population and Covid cases?

# we can easily tell from previous plots that most of the cases are from big
# countries. Now, I'm curious about the relation between population & Covid cases

# Bubble chart: population (x) against confirmed cases (y), both on log10
# axes; bubble size is the number of tests and fill colour is the continent.
q11 <- summary_covid %>%

    # Reorder countries to having big bubbles at the back
    arrange(desc(population)) %>%

    # prepare text for tooltip
    # NOTE(review): commaNum() is not defined in this section; presumably a
    # thousands-separator formatter loaded from function_path, so confirm.
    mutate(
        text = paste0(
            "Country: ", country,
            "\nPopulation: ", commaNum(population),
            "\nTotal Cases:\t", commaNum(total_confirmed),
            "\nTotal Tests\t", commaNum(total_tests)
        )
    ) %>%

    ggplot(
        aes(
            x = population,
            y = total_confirmed,
            fill = continent,
            size = total_tests,
            text = text
        )
    ) +
    # shape 21 is a filled circle with a separate (black) outline
    geom_point(alpha = 0.5, color = "black", shape = 21, na.rm = TRUE) +
    scale_x_log10(
        labels = unit_format(unit = "M", scale = 1e-6),
        breaks = 1e+3 * 10^(seq(0, 20, 2))
    ) +
    scale_y_log10(
        labels = unit_format(unit = "M", scale = 1e-6),
        breaks = 10^(seq(1, 21, 2))
    ) +
    scale_size(range = c(2, 25), name = "Total Tests (M), Size") +
    labs(fill = "Continent, Color") +
    scale_fill_viridis(discrete = TRUE, option = "D") +
    coord_cartesian(clip = "off") +
    ylab("Covid Cases (M), log10(n)") +
    xlab("Population (M), log10(n)") +
    theme_bw()

# Pass the plot explicitly instead of relying on ggplot2's last_plot()
ggsave(paste0(sttc_plot_path, "q11_bubble.png"), plot = q11)
## Saving 12 x 6 in image
# turn interactive ggplot with plotly and save it
# (with an .html extension so the widget opens in a browser)
itrt_q11 <- ggplotly(q11, tooltip = "text")
saveWidget(itrt_q11, file = paste0(itrt_plot_path, "q11.html"))

q11

itrt_q11

Question 2

Which country did well in this pandemic war?

Question 3

Can weather or geographic location affect the spread of Covid?

Question 4

An even more in-depth question: we have seen that population definitely affects how Covid spreads, so let's check why social distancing is needed.

Question 5

How did the R rate behave starting from the beginning of covid?

clean up

# pacman::p_unload(all)